Jan-Philipp Kolb
9 Mai 2017
Die Struktur der Daten kann man sich mit einem JSON Viewer anschauen
library("jsonlite")
DRINKWATER <- fromJSON("data/RomDrinkingWater.geojson")
names(DRINKWATER)[1:3]
## [1] "type" "generator" "copyright"
names(DRINKWATER)[4:5]
## [1] "timestamp" "features"
head(DRINKWATER$features)
## type id properties.@id properties.amenity properties.flow
## 1 Feature node/246574149 node/246574149 drinking_water push-button
## 2 Feature node/246574150 node/246574150 drinking_water <NA>
## 3 Feature node/246574151 node/246574151 drinking_water <NA>
## 4 Feature node/248743324 node/248743324 drinking_water <NA>
## 5 Feature node/251773348 node/251773348 drinking_water <NA>
## 6 Feature node/251773551 node/251773551 drinking_water <NA>
## properties.type properties.name properties.name:fr properties.wheelchair
## 1 nasone <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA>
## 5 nasone <NA> <NA> <NA>
## 6 <NA> Acqua Marcia Eau potable yes
## properties.created_by properties.indoor geometry.type
## 1 <NA> <NA> Point
## 2 <NA> <NA> Point
## 3 <NA> <NA> Point
## 4 <NA> <NA> Point
## 5 <NA> <NA> Point
## 6 <NA> <NA> Point
## geometry.coordinates
## 1 12.49191, 41.89479
## 2 12.49095, 41.89489
## 3 12.48774, 41.89450
## 4 12.48773, 41.89354
## 5 12.48529, 41.88539
## 6 12.48386, 41.89332
my_repos <- fromJSON("https://api.github.com/users/japhilko/repos")
names(my_repos)
## [1] "id" "name" "full_name"
## [4] "owner" "private" "html_url"
## [7] "description" "fork" "url"
## [10] "forks_url" "keys_url" "collaborators_url"
## [13] "teams_url" "hooks_url" "issue_events_url"
## [16] "events_url" "assignees_url" "branches_url"
## [19] "tags_url" "blobs_url" "git_tags_url"
## [22] "git_refs_url" "trees_url" "statuses_url"
## [25] "languages_url" "stargazers_url" "contributors_url"
## [28] "subscribers_url" "subscription_url" "commits_url"
## [31] "git_commits_url" "comments_url" "issue_comment_url"
## [34] "contents_url" "compare_url" "merges_url"
## [37] "archive_url" "downloads_url" "issues_url"
## [40] "pulls_url" "milestones_url" "notifications_url"
## [43] "labels_url" "releases_url" "deployments_url"
## [46] "created_at" "updated_at" "pushed_at"
## [49] "git_url" "ssh_url" "clone_url"
## [52] "svn_url" "homepage" "size"
## [55] "stargazers_count" "watchers_count" "language"
## [58] "has_issues" "has_projects" "has_downloads"
## [61] "has_wiki" "has_pages" "forks_count"
## [64] "mirror_url" "open_issues_count" "forks"
## [67] "open_issues" "watchers" "default_branch"
library(jsonlite)
# Query the Ergast F1 web service for the results of round 1 of the 2004
# season (returned as nested lists/data frames by fromJSON).
res <- fromJSON('http://ergast.com/api/f1/2004/1/results.json')
# Drill down to the driver table of the first (only) race in the response.
drivers <- res$MRData$RaceTable$Races$Results[[1]]$Driver
colnames(drivers)
## [1] "driverId" "code" "url" "givenName"
## [5] "familyName" "dateOfBirth" "nationality" "permanentNumber"
# NOTE(review): hard-coded API credential committed to source — this key
# should be rotated and loaded via Sys.getenv() instead of being stored here.
article_key <- "&api-key=c2fede7bd9aea57c898f538e5ec0a1ee:6:68700045"
# NYT Article Search API query for "obamacare socialism".
url <- "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=obamacare+socialism"
req <- fromJSON(paste0(url, article_key))
# The matching articles live in response$docs.
articles <- req$response$docs
colnames(articles)
## [1] "web_url" "snippet" "lead_paragraph"
## [4] "abstract" "print_page" "blog"
## [7] "source" "multimedia" "headline"
## [10] "keywords" "pub_date" "document_type"
## [13] "news_desk" "section_name" "subsection_name"
## [16] "byline" "type_of_material" "_id"
## [19] "word_count" "slideshow_credits"
XML Paket
library(XML)
citation("XML")
##
## To cite package 'XML' in publications use:
##
## Duncan Temple Lang and the CRAN Team (2016). XML: Tools for
## Parsing and Generating XML Within R and S-Plus. R package
## version 3.98-1.5. https://CRAN.R-project.org/package=XML
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {XML: Tools for Parsing and Generating XML Within R and S-Plus},
## author = {Duncan Temple Lang and the CRAN Team},
## year = {2016},
## note = {R package version 3.98-1.5},
## url = {https://CRAN.R-project.org/package=XML},
## }
##
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
url <- "http://api.openstreetmap.org/api/0.6/relation/62422"
library(xml2)
BE <- xmlParse(url)
Administrative Grenzen Berlin
xmltop = xmlRoot(BE)
class(xmltop)## [1] "XMLInternalElementNode" "XMLInternalNode"
## [3] "XMLAbstractNode"
xmlSize(xmltop)
## [1] 1
xmlSize(xmltop[[1]])
## [1] 326
Xpath, the XML Path Language, is a query language for selecting nodes from an XML document.
xpathApply(BE,"//tag[@k = 'source:population']")
## [[1]]
## <tag k="source:population" v="http://www.statistik-berlin-brandenburg.de/Publikationen/Stat_Berichte/2010/SB_A1-1_A2-4_q01-10_BE.pdf 2010-10-01"/>
##
## attr(,"class")
## [1] "XMLNodeSet"
url2 <- "http://api.openstreetmap.org/api/0.6/node/2923760808"
RennesBa <- xmlParse(url2)
url3 <- "http://api.openstreetmap.org/api/0.6/way/72799743"
MadCalle <- xmlParse(url3)
http://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf
http://www.di.fc.ul.pt/~jpn/r/web/index.html#parsing-xml
http://www.w3schools.com/xml/xquery_intro.asp
http://giventhedata.blogspot.de/2012/06/r-and-web-for-beginners-part-ii-xml-in.html
http://gastonsanchez.com/Handling_and_Processing_Strings_in_R.pdf
citation("XML")
##
## To cite package 'XML' in publications use:
##
## Duncan Temple Lang and the CRAN Team (2016). XML: Tools for
## Parsing and Generating XML Within R and S-Plus. R package
## version 3.98-1.5. https://CRAN.R-project.org/package=XML
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {XML: Tools for Parsing and Generating XML Within R and S-Plus},
## author = {Duncan Temple Lang and the CRAN Team},
## year = {2016},
## note = {R package version 3.98-1.5},
## url = {https://CRAN.R-project.org/package=XML},
## }
##
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
rvest
library(rvest)
ht <- read_html('https://www.google.co.in/search?q=guitar+repair+workshop')
links <- ht %>% html_nodes(xpath='//h3/a') %>% html_attr('href')
gsub('/url\\?q=','',sapply(strsplit(links[as.vector(grep('url',links))],split='&'),'[',1))
## [1] "http://theguitarrepairworkshop.com/"
## [2] "http://www.guitarservices.com/"
## [3] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-project.html"
## [4] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-layout-design.html"
## [5] "https://www.facebook.com/The-Guitar-Repair-Workshop-847517635259712/"
## [6] "https://www.taylorguitars.com/dealer/guitar-repair-workshop-ltd"
## [7] "http://guitarworkshopglasgow.com/pages/repairs-1"
## [8] "https://www.justdial.com/Delhi-NCR/Guitar-Repair-Services/nct-10988623"
## [9] "https://www.justdial.com/Mumbai/Guitar-Repair-Services/nct-10988623"
install.packages("tidyverse")
library(tidyverse)
library(stringr)
library(forcats)
library(ggmap)
library(rvest)
html.world_ports <- read_html("https://en.wikipedia.org/wiki/List_of_busiest_container_ports")
df.world_ports <- html_table(html_nodes(html.world_ports, "table")[[2]], fill = TRUE)
glimpse(df.world_ports)
## Observations: 50
## Variables: 15
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ Port <chr> "Shanghai", "Singapore", "Shenzhen", "Ningbo-Zhoushan...
## $ Economy <chr> "China", "Singapore", "China", "China", "Hong Kong", ...
## $ 2015[1] <chr> "36,516", "30,922", "24,142", "20,636", "20,073", "19...
## $ 2014[2] <chr> "35,268", "33,869", "23,798", "19,450", "22,374", "18...
## $ 2013[3] <chr> "33,617", "32,240", "23,280", "17,351", "22,352", "17...
## $ 2012[4] <chr> "32,529", "31,649", "22,940", "16,670", "23,117", "17...
## $ 2011[5] <chr> "31,700", "29,937", "22,570", "14,686", "24,384", "16...
## $ 2010[6] <chr> "29,069", "28,431", "22,510", "13,144", "23,532", "14...
## $ 2009[7] <chr> "25,002", "25,866", "18,250", "10,502", "20,983", "11...
## $ 2008[8] <chr> "27,980", "29,918", "21,414", "11,226", "24,248", "13...
## $ 2007[9] <chr> "26,150", "27,932", "21,099", "9,349", "23,881", "13,...
## $ 2006[10] <chr> "21,710", "24,792", "18,469", "7,068", "23,539", "12,...
## $ 2005[11] <chr> "18,084", "23,192", "16,197", "5,208", "22,427", "11,...
## $ 2004[12] <chr> "14,557", "21,329", "13,615", "4,006", "21,984", "11,...
rvest
library(rvest)
ht <- read_html('https://www.google.co.in/search?q=guitar+repair+workshop')
links <- ht %>% html_nodes(xpath='//h3/a') %>% html_attr('href')
gsub('/url\\?q=','',sapply(strsplit(links[as.vector(grep('url',links))],split='&'),'[',1))
## [1] "http://theguitarrepairworkshop.com/"
## [2] "http://www.guitarservices.com/"
## [3] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-project.html"
## [4] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-layout-design.html"
## [5] "https://www.facebook.com/The-Guitar-Repair-Workshop-847517635259712/"
## [6] "https://www.taylorguitars.com/dealer/guitar-repair-workshop-ltd"
## [7] "http://guitarworkshopglasgow.com/pages/repairs-1"
## [8] "https://www.justdial.com/Delhi-NCR/Guitar-Repair-Services/nct-10988623"
## [9] "https://www.justdial.com/Mumbai/Guitar-Repair-Services/nct-10988623"
Im Folgenden werde ich zeigen, wie man Textinformationen aus Wikipedia herunterladen, verarbeiten und analysieren kann.
install.packages("NLP")
install.packages("tm")
install.packages("FactoMineR")
stringi von Marek Gagolewski and Bartek Tartanus bietet Möglichkeiten zur String-Verarbeitung.
library("stringi")
tm ist ein R-Paket, um Text Mining zu realisieren. Es wurde von Ingo Feinerer, Kurt Hornik und David Meyer geschrieben.
library("tm")
Das FactoMineR-Paket wurde von Sebastien Le, Julie Josse und Francois Husson zur Durchführung der Hauptkomponentenanalyse erstellt.
library("FactoMineR")
wiki <- "http://de.wikipedia.org/wiki/"
# German-Wikipedia article titles to download.
titles <- c("Zika-Virus", "Influenza-A-Virus_H1N1",
            "Spanische_Grippe", "Influenzavirus",
            "Vogelgrippe_H5N1",
            "Legionellose-Ausbruch_in_Warstein_2013",
            "Legionellose-Ausbruch_in_Jülich_2014")
# Fetch each article page and collapse its lines into one string per article.
# (seq_along() instead of 1:length(); the flattened source had fused the
# close of the titles vector with the articles preallocation.)
articles <- character(length(titles))
for (i in seq_along(titles)) {
  articles[i] <- stri_flatten(
    readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
Das Folgende basiert auf einem Blogpost von Norbert Ryciak über die automatische Kategorisierung von Wikipedia-Artikeln.
# Clean the corpus step by step: strip HTML tags and literal tabs, coerce to
# plain-text documents, collapse whitespace, drop German stopwords, remove
# punctuation and lower-case everything.  (The flattened source fused the end
# of the tab replacement with the PlainTextDocument step; restored here as
# separate statements.)
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(
  x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(
  x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("german"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, tolower)
# docs8 <- tm_map(docs8, PlainTextDocument)
# NOTE(review): in the flattened source the two statements below had been
# swallowed into the comment above; restored as executable code.
dtm <- DocumentTermMatrix(docs8)
dtm2 <- as.matrix(dtm)
# Term frequencies over the whole corpus, sorted descending; keep only the
# terms occurring more than 20 times.
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
words <- frequency[frequency > 20]
# Restrict the document-term matrix to the frequent terms and transpose so
# that rows are terms and columns are documents.  Equivalent to the original
# cbind-in-a-loop (which grew the matrix one column at a time, O(n^2), and
# broke for a single-row dtm2 because 2:nrow(dtm2) counts backwards), done in
# one vectorised step; column names are assigned from `titles` afterwards.
s <- t(dtm2[, colnames(dtm2) %in% names(words), drop = FALSE])
colnames(s) <- titles
PCA(s)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 125 individuals, described by 7 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# Scale each row (term) by its standard deviation, then cluster the documents
# (columns) hierarchically with Ward's method.  hclust()'s "ward" option was
# renamed in R >= 3.1.0; "ward.D" reproduces the old behaviour without the
# deprecation warning ("ward.D2" would square the dissimilarities first).
s0 <- s / apply(s, 1, sd)
h <- hclust(dist(t(s0)), method = "ward.D")
plot(h, labels = titles, sub = "")
git commit
git push
http://stackoverflow.com/questions/1125968/force-git-to-overwrite-local-files-on-pull
WinDirStat https://support.microsoft.com/de-de/kb/912997 http://www.pcwelt.de/tipps/Update-Dateien-loeschen-8357046.html
install.packages("Rcpp")
library(Rcpp)
# Compile and register a C++ function callable from R via Rcpp:
# add(x, y, z) returns the integer sum of its three arguments.
cppFunction('int add(int x, int y, int z) {
int sum = x + y + z;
return sum;
}')
# add works like a regular R function
add
add(1, 2, 3)
Tutorial on Rcpp by Hadley Wickham
library(Rcpp)
cppFunction('int add(int x, int y, int z) {
int sum = x + y + z;
return sum;
}')
add(1, 2, 3)
RPostgreSQL
# install.packages("RPostgreSQL")
library("RPostgreSQL")
sudo -u postgres createuser Japhilko
sudo -u postgres createdb -E UTF8 -O Japhilko offlgeoc
Die postgis Erweiterung muss für die Datenbank installiert werden:
CREATE EXTENSION postgis;
CREATE EXTENSION hstore;
osm2pgsql -s -U postgres -d offlgeoc /home/kolb/Forschung/osmData/data/saarland-latest.osm.pbf
sudo -u postgres createdb -E UTF8 -O Japhilko offlgeocRLP
CREATE EXTENSION postgis;
osm2pgsql -s -U postgres -d offlgeocRLP -o gazetteer /home/kolb/Forschung/osmData/data/rheinland-pfalz-latest.osm.pbf
So bekommt man alle administrativen Grenzen:
SELECT name FROM planet_osm_polygon WHERE boundary='administrative'
# Connect to the local PostGIS database "offlgeocRLP" as the postgres user.
pw <- "1234"
drv <- dbDriver("PostgreSQL")
con <- dbConnect(
  drv,
  dbname = "offlgeocRLP",
  host = "localhost",
  port = 5432,
  user = "postgres",
  password = pw
)
rm(pw) # do not keep the password in the workspace
dbExistsTable(con, "planet_osm_polygon")
df_postgres <- dbGetQuery(con, "SELECT name, admin_level FROM planet_osm_polygon WHERE boundary='administrative'")
barplot(table(df_postgres[,2]),col="royalblue")
df_adm8 <- dbGetQuery(con, "SELECT name, admin_level FROM planet_osm_polygon WHERE boundary='administrative' AND admin_level='8'")
library(knitr)
# kable(head(df_adm8))
df_hnr <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point
WHERE planet_osm_line.name='Nordring' AND planet_osm_line.highway IN ('motorway','trunk','primary')
AND planet_osm_point.name='Ludwigshafen' AND planet_osm_point.place IN ('city', 'town')
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
df_hnr <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point
WHERE planet_osm_line.name='Nordring' AND planet_osm_point.name='Ludwigshafen'
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
head(df_hnr)
df_ <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point
WHERE planet_osm_line.name='Nordring' AND planet_osm_point.name='Ludwigshafen'
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
head(df_hnr)
colnames(df_)
table(df_$name)
df_sipp <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point
WHERE planet_osm_line.name='Rechweg' AND planet_osm_point.name='Sippersfeld'
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
head(df_sipp)
restnam <- dbGetQuery(con, "SELECT name, COUNT(osm_id) AS anzahl
FROM planet_osm_point
WHERE amenity = 'restaurant'
AND name <> ''
GROUP BY name
ORDER BY anzahl DESC
LIMIT 10")
head(restnam)
install.packages("mongolite")
library(mongolite)
m <- mongo(collection = "diamonds")